%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from Bio import Seq, SeqIO, Align, AlignIO, Phylo, Alphabet, pairwise2
from Bio.SeqRecord import SeqRecord
from Bio.Align import AlignInfo, Applications
from Bio.Phylo import draw, TreeConstruction # TreeConstruction.DistanceTreeConstructor
# https://bioinformatics.stackexchange.com/questions/4337/ \
#biopython-phylogenetic-tree-replace-branch-tip-labels-by-sequence-logos
import numpy as np
import seaborn as sns
from sklearn import manifold, metrics, cluster, neighbors, decomposition, preprocessing
import skbio, parasail, dendropy, pandas
import sys, gzip, re, glob, pickle, collections, subprocess, os, errno, random, itertools
def print_redblack(textr, textbb="", textbl=""):
print ('\x1b[0;1;31;1m'+ str(textr) + '\x1b[0;1;30;1m'+ str(textbb) + '\x1b[0;0;30;0m'+ str(textbl) + '\x1b[0m')
genus_species_accession (where the binomial by default, i.e. notebook 026, is GTDB), and gets a new genus_species based on accession using other columns of GTDB table:| column | taxon database |
|---|---|
| 0 | GTDB |
| 1 | SILVA 23S |
| 3 | NCBI |
| 4 | SILVA 16S |
gbff_dir = "/media/deolivl/QIB_deolivl/bigdata/" ## directory (or directories, in our case) with refseq genomes
#gbff_dir = "./bigdata/"
tmp_table = [line.strip() for line in open("./bigdata/gtdb_list.csv", 'r')]
print_redblack("header: <file> ", tmp_table[0].split(';')[1:]) ## first column is data.frame index from R
tmp_table = [line.split(';')[1:] for line in tmp_table[1:]] ## skip first line, with headers
sp_tbl_dict = { str(re.search('GCF_(\d+)\.', x[0]).group(1)): x[1:] for x in tmp_table} # key:value
k = list(sp_tbl_dict)[1]; print ("first element of list: ", k, sp_tbl_dict[k], "\n")
## receives a leaf name and returns a new leaf name preserving accession but changing taxonomy DB
def leaf_name_from_leaf_name (leaf_str, tbl_col = 3):
accession = leaf_str.split(".")[2]
try:
binstr = ".".join(sp_tbl_dict[accession][tbl_col].split()[:2])
except:
binstr = "unknown.unknown"
binstr += "." + accession
return binstr
def change_leaf_names_from_db (tree, tbl_col=3):
for tx in tree.taxon_namespace:
tx.label = leaf_name_from_leaf_name (tx.label, tbl_col)
return tree
outdir = "./026_results/"
ordered_types = ["5S","16Sv1v2","16Sv3v4","16S","23S","16S5S","23S5S","16S23S","16S23S5S"]
outfile_list = [outdir + str(keys) for keys in ordered_types] # for this data set we know order :D
print (outfile_list)
def patristic_distances_from_treefile (filename, tbl_col = -1, have_paralogs = False, shuffle = False):
tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
if (tbl_col >= 0):
tree = change_leaf_names_from_db (tree, tbl_col)
if shuffle:
tree = dendropy.simulate.treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 1000)
species = ['.'.join(t.label.split('.')[:2]) for t in tree.taxon_namespace] ## follow order of taxon_namespace
ntaxa = len(tree.taxon_namespace)
distmat = np.zeros((ntaxa,ntaxa)) # diagonals are zero
nodemat = np.zeros((ntaxa,ntaxa))
## STEP 1: pairwise distances along the tree
pdm = tree.phylogenetic_distance_matrix() # initialises class
for i,j in itertools.combinations(range(ntaxa),2):
distmat[i,j] = distmat[j,i] = pdm.distance(tree.taxon_namespace[i], tree.taxon_namespace[j])
nodemat[i,j] = nodemat[j,i] = pdm.path_edge_count(tree.taxon_namespace[i], tree.taxon_namespace[j])
## STEP 2: silhouette score using pairwise distances and taxonomic information
mdist = metrics.silhouette_samples(distmat, species, metric="precomputed")
mnode = metrics.silhouette_samples(nodemat, species, metric="precomputed")
if have_paralogs:
sample = ['.'.join(t.label.split('.')[:3]) for t in tree.taxon_namespace]
m3 = metrics.silhouette_samples(distmat, sample, metric="precomputed")
m4 = metrics.silhouette_samples(nodemat, sample, metric="precomputed")
return mdist, mnode, m3, m4
return mdist, mnode
def silhouette_str(distrib):
return '{:7.3f}'.format(np.percentile(distrib, 5)) + \
'{:7.3f}'.format(np.percentile(distrib, 25)) + \
'{:7.3f}'.format(np.percentile(distrib, 50)) + \
'{:7.3f}'.format(np.percentile(distrib, 75)) + \
'{:7.3f} '.format(sum(distrib>0)/float(len(distrib)))
silhouette_str_head="5% 25% 50% 75% proportion_positives <-- weighted | unweighted -->"
mdis, mnod = patristic_distances_from_treefile (outfile_list[0]+ "_long.fasta.treefile", shuffle = True)
# bar plot uses path difference
bplot_label = ["random"] ; bplot_hue = ["proportion of positive silhouette scores"];
bplot_value = [sum(mnod>0)/float(len(mnod))]
# violin plots use weighted distances
vio_silho_x = ["random"] * mdis.shape[0]; vio_silho_y = list(mdis)
print (silhouette_str_head)
print (' {:9s}'.format("random"), "\t", silhouette_str(mdis), silhouette_str(mnod))
suffix1 = ".fasta.treefile"
for suffix2, title in zip (["_long", "_consensus"], ["long ", "cons "]):
#for suffix2, title2 in zip (["_consensus"], ["cons "]): ## testing, on consensus only
suffix = suffix2 + suffix1;
for fname, rname in zip(outfile_list, [i.split('/')[-1] for i in outfile_list]):
mdis, mnod = patristic_distances_from_treefile (fname + suffix)
print (title + '{:6s}'.format(rname), "\t", silhouette_str(mdis), silhouette_str(mnod))
if "long" in title:
bplot_label.append(rname) ; bplot_hue.append("proportion of positive silhouette scores");
bplot_value.append(sum(mnod>0)/float(len(mnod)))
vio_silho_x.extend([str(rname)] * mdis.shape[0]); vio_silho_y.extend(list(mdis))
print("\t_")
We have one value per species for all scores.
def sp_list_from_labels (labels):
return ['.'.join(t.split('.')[:2]) for t in labels]
def genera_set_from_namespace (txnamespace):
x = [t.label.split('.')[0] for t in txnamespace]
return set(x)
def taxa_list_from_strain (leaves, strain): # strain is list of species or genera
return [x for x in leaves if strain in x]
def average_lengths_lca (samples, tre, pdm, lca): ## assumes names are unique
taxon_label_list = [tre.find_node_with_taxon_label(x) for x in samples]
avge = {}
for t in taxon_label_list:
if t.label not in avge: ## not calculated yet
is_mono = True
mono_taxa = None
while is_mono and t is not lca:
t = t.parent_node
taxa_below = [x for x in tre.taxon_namespace.bitmask_taxa_list(t.leafset_bitmask)]
is_mono = all([i.label in samples for i in taxa_below])
if (is_mono):
mono_taxa = [x for x in taxa_below]
if mono_taxa:
pairdist = np.mean([pdm.distance(i, j) for i,j in itertools.combinations(mono_taxa,2)])
for i in mono_taxa:
avge[i.label] = pairdist
else:
avge[t.label] = 0.
all_avges = [v for k,v in avge.items()]
return max(all_avges), np.mean(all_avges)
def monophyly_score_from_treefile (filename, tbl_col = -1, shuffle = False):
tree = dendropy.Tree.get(path=filename, schema="newick", preserve_underscores=True)
if (tbl_col >= 0):
tree = change_leaf_names_from_db (tree, tbl_col)
if shuffle:
x = tree.length()
tree = dendropy.simulate.treesim.pure_kingman_tree(taxon_namespace=tree.taxon_namespace, pop_size = 5000)
for edge in tree.preorder_edge_iter():
edge.length = edge.length/x
else:
tree.reroot_at_midpoint(update_bipartitions=True)
tree.encode_bipartitions()
pdm = tree.phylogenetic_distance_matrix() # initialises patristic distance class
leaf_list = [x.label for x in tree.taxon_namespace]
freq_species = [] ; monoscore = []; max_avge = []; avge_avge = []
for sp in set(sp_list_from_labels(leaf_list)): # for each species
samples_from_sp = taxa_list_from_strain (leaf_list, sp)
mrca = tree.mrca(taxon_labels=samples_from_sp) # MRCA node of rooted tree is MRCA edge of unrooted tree
samples_below_mrca = [x.label for x in tree.taxon_namespace.bitmask_taxa_list(mrca.leafset_bitmask)]
samples_above_mrca = [x for x in leaf_list if x not in samples_below_mrca] # unrooted tree
try:
m1 = len(samples_from_sp)/len(samples_below_mrca)
except:
m1 = len(samples_from_sp)/len(samples_above_mrca)
monoscore.append(m1)
try:
m1 = 1./len(set(sp_list_from_labels(samples_below_mrca)))
except:
m1 = 1./len(set(sp_list_from_labels(samples_above_mrca)))
freq_species.append(m1)
x1, x2 = average_lengths_lca (samples_from_sp, tree, pdm, mrca)
max_avge.append(x1)
avge_avge.append(x2)
#print (samples_above_mrca)
return monoscore, freq_species, max_avge, avge_avge
def mono_str(mono_, freq_, mav_, avv_): ## was 25 and 50, now 5 and 25
return '{:6.3f} '.format(np.percentile(mono_, 5)) + \
'{:6.3f} '.format(np.percentile(mono_, 10)) + \
'{:6.3f} '.format(np.percentile(mono_, 25)) + \
'{:6.3f}\t'.format(sum(np.array(freq_) > 0.9)/float(len(freq_))) + \
'{:.7f} '.format(np.percentile(mav_, 50)) + \
'{:.7f} '.format(np.percentile(mav_, 95)) + \
'{:.7f} '.format(np.percentile(avv_, 50)) + \
'{:.7f} '.format(np.percentile(avv_, 95))
mono_str_head="\tmonoph_5% monoph_10% monoph_25% prop_species>0.9 \t max_avge_50% max_avge_95% mean_avge_50% mean_avge_95%\n"
mono, fspec, mav, avv = monophyly_score_from_treefile (outfile_list[0]+ "_consensus.fasta.treefile", shuffle = True)
bplot_label.append("random") ; bplot_hue.append("proportion of monophyletic groups")
bplot_value.append(sum(np.array(fspec) > 0.999)/float(len(fspec)))
vio_mono_x = ["random"] * len(mono)
vio_mono_y = mono; vio_mav_y = mav
print (mono_str_head)
print ('{:9s}'.format("random"), "\t", mono_str(mono, fspec, mav, avv))
suffix1 = ".fasta.treefile"
for suffix2, title in zip (["_long", "_consensus"], ["long ", "cons "]):
suffix = suffix2 + suffix1;
for fname, rname in zip(outfile_list, [i.split('/')[-1] for i in outfile_list]):
mono, fspec, mav, avv = monophyly_score_from_treefile (fname + suffix)
print (title + '{:7s}'.format(rname), "\t", mono_str(mono, fspec, mav, avv))
if "long" in title:
bplot_label.append(rname) ; bplot_hue.append("proportion of monophyletic groups")
bplot_value.append(sum(np.array(fspec) > 0.999)/float(len(fspec)))
vio_mono_x.extend([rname] * len(mono))
vio_mono_y.extend(mono); vio_mav_y.extend(mav);
print("\t_")
fig, axes = plt.subplots(3,1) ; fig.set_size_inches(16, 24);
matplotlib.rcParams['figure.dpi'] = 200
matplotlib.rc('font', weight='bold')
fig.subplots_adjust(top=.91, bottom=.01, left=.02, right=.99, wspace=.2, hspace=.2)
sns.set();
sns.set_palette("cubehelix", 12);
sns.boxenplot(x=vio_silho_x, y=vio_silho_y, ax = axes[0], outlier_prop=0.00001)
axes[0].set_ylabel("tree silhouette scores", fontsize=22, weight="bold") # "silhouette scores from patristic distances"
axes[0].set_xticklabels(axes[0].get_xticklabels(),rotation=40)
axes[0].tick_params(labelsize=16)
sns.set(); sns.set_palette("cubehelix", 12)
sns.pointplot(x=vio_mono_x, y=vio_mono_y, ax = axes[1], linestyles="--", errwidth=0.5, color="gray", scale=0.4)
sns.swarmplot(x=vio_mono_x, y=vio_mono_y, ax = axes[1], size=10, linewidth=1)
axes[1].set_ylabel("monophyly scores", fontsize=22, weight="bold")
axes[1].set_xticklabels(axes[1].get_xticklabels(),rotation=40)
axes[1].tick_params(labelsize=16)
sns.set(); sns.set_palette("cubehelix", 12) # "Average distance between samples from best monophyletic subtree"
sns.boxplot(x=vio_mono_x, y=vio_mav_y, ax = axes[2])
axes[2].set_ylabel("Average clade distance", fontsize=22, weight="bold")
axes[2].set_xticklabels(axes[2].get_xticklabels(),rotation=40)
#axes[2].set_ylim(-0.0001,0.015)
axes[2].tick_params(labelsize=16)
#sns.boxplot(x=rnatypelist, y=monolist_sp, ax = axes[1]) # inner="sticks",
#sns.pointplot(x=rnatypelist, y=monolist_sp, ax = axes[1])
fig, axes = plt.subplots(1) ; fig.set_size_inches(12, 6);
fig.subplots_adjust(top=.99, bottom=.01, left=.01, right=.99, wspace=.1, hspace=.1)
sns.set(); sns.set_palette("cubehelix", 12)
sns.barplot(hue=bplot_label, y=bplot_value, x=bplot_hue, ax=axes)
axes.set_ylabel("proportion", fontsize=18, fontweight="bold")
axes.tick_params(labelsize=14)
#axes.set_title("proportion of sample pairs with good silhouettes and proportion of monophyletic groups")
fig, axes = plt.subplots(1) ; fig.set_size_inches(12, 5);
fig.subplots_adjust(top=.99, bottom=.01, left=.01, right=.99, wspace=.1, hspace=.1)
sns.set(); sns.set_palette("cubehelix", 12) # "Average distance between samples from best monophyletic subtree"
sns.boxplot(x=vio_mono_x, y=vio_mav_y, ax = axes)
axes.set_ylabel("Average clade distance (y axis truncated)", fontsize=16, weight="bold")
axes.set_xticklabels(axes.get_xticklabels(),rotation=40)
axes.set_ylim(-0.0001,0.024)
axes.tick_params(labelsize=13)
rand_tfile = outfile_list[0]+ "_long.fasta.treefile" # _consensus or _long, we just use leaf names
suffix1 = ".fasta.treefile"
#suffix1 = "_upgma.treefile"
suffix2 = "_long"
#suffix2 = "_consensus"
suffix = suffix2 + suffix1;
table_col = 0 ## 0 GTDB, 1 silvaLSU, 3 NCBI, or 4 silvaSSU
title_list = ["GTDB", "LSU SILVA", "unused", "NCBI", "SSU SILVA"]
## Silhouette scores
mdis, mnod = patristic_distances_from_treefile (rand_tfile, tbl_col = table_col, shuffle = True)
bplot_label = ["random"] ; bplot_hue = ["proportion of positive silhouette scores"];
bplot_value = [sum(mnod>0)/float(len(mnod))]
# violin plots use weighted and unweighted distances (unweighted is suppl info; above we use only weighted)
vio_silho_x_w = ["random"] * mdis.shape[0]; vio_silho_y_w = list(mdis)
vio_silho_x_u = ["random"] * mnod.shape[0]; vio_silho_y_u = list(mnod)
print ("Silhouette\n", silhouette_str_head)
print (suffix2 + ' {:8s}'.format("random"), " ", silhouette_str(mdis), silhouette_str(mnod))
for fname, rname in zip(outfile_list, [i.split('/')[-1] for i in outfile_list]):
mdis, mnod = patristic_distances_from_treefile (fname + suffix)
print (suffix2 + ' {:8s}'.format(rname), " ", silhouette_str(mdis), silhouette_str(mnod))
bplot_label.append(rname) ; bplot_hue.append("proportion of positive silhouette scores");
bplot_value.append(sum(mnod>0)/float(len(mnod)))
vio_silho_x_w.extend([str(rname)] * mdis.shape[0]); vio_silho_y_w.extend(list(mdis))
vio_silho_x_u.extend([str(rname)] * mnod.shape[0]); vio_silho_y_u.extend(list(mnod))
## Monophyly scores
mono, fspec, mav, avv = monophyly_score_from_treefile (rand_tfile, tbl_col = table_col, shuffle = True)
bplot_label.append("random") ; bplot_hue.append("proportion of monophyletic groups")
bplot_value.append(sum(np.array(fspec) > 0.999)/float(len(fspec)))
vio_mono_x = ["random"] * len(mono)
vio_mono_y = mono; vio_mav = mav; vio_fspec = fspec; vio_avv = avv;
print ("\nMonophyly\n", mono_str_head)
print (suffix2 + ' {:8s}'.format("random"), " ", mono_str(mono, fspec, mav, avv))
for fname, rname in zip(outfile_list, [i.split('/')[-1] for i in outfile_list]):
mono, fspec, mav, avv = monophyly_score_from_treefile (fname + suffix)
print (suffix2 + ' {:8s}'.format(rname), " ", mono_str(mono, fspec, mav, avv))
bplot_label.append(rname) ; bplot_hue.append("proportion of monophyletic groups")
bplot_value.append(sum(np.array(fspec) > 0.999)/float(len(fspec)))
vio_mono_x.extend([rname] * len(mono))
vio_mono_y.extend(mono); vio_mav.extend(mav); vio_fspec.extend(fspec); vio_avv.extend(avv);
matplotlib.rcParams['figure.dpi'] = 200
matplotlib.rc('font', weight='bold')
sns.set(); sns.set_palette("cubehelix", 12);
data = pandas.DataFrame([i for i in zip(vio_mono_y, vio_mav, vio_mono_x)], columns=["mono","max","segment"])
g = sns.lmplot(x="mono", y="max", hue="segment", col="segment", data=data, col_wrap=3)
g.set(ylim=(-0.01, 0.1), xlim=(-0.01,1.05))
data = pandas.DataFrame([i for i in zip(vio_silho_y_u, vio_silho_y_w, vio_silho_x_w)], columns=["unw","weight","segment"])
g=sns.lmplot(x="unw", y="weight", hue="segment", col="segment", data=data, col_wrap=3)
g.set(ylim=(-1, 1), xlim=(-1,1))
fig, axes = plt.subplots(3,1) ; fig.set_size_inches(16, 24);
matplotlib.rcParams['figure.dpi'] = 200
matplotlib.rc('font', weight='bold')
fig.subplots_adjust(top=.91, bottom=.01, left=.02, right=.99, wspace=.2, hspace=.2)
sns.set(); sns.set_palette("cubehelix", 12);
sns.boxenplot(x=vio_silho_x_w, y=vio_silho_y_w, ax = axes[0], outlier_prop=0.00001)
axes[0].set_ylabel("tree silhouette scores (weighted)", fontsize=22, weight="bold")
axes[0].set_xticklabels(axes[0].get_xticklabels(),rotation=40)
axes[0].tick_params(labelsize=16)
axes[0].set_title(title_list[table_col])
sns.set(); sns.set_palette("cubehelix", 12)
sns.pointplot(x=vio_mono_x, y=vio_mono_y, ax = axes[1], linestyles="--", errwidth=0.5, color="gray", scale=0.4)
sns.swarmplot(x=vio_mono_x, y=vio_mono_y, ax = axes[1], size=10, linewidth=1)
axes[1].set_ylabel("monophyly scores", fontsize=22, weight="bold")
axes[1].set_xticklabels(axes[1].get_xticklabels(),rotation=40)
axes[1].tick_params(labelsize=16)
sns.set(); sns.set_palette("cubehelix", 12) # "Average distance between samples from best monophyletic subtree"
sns.boxplot(x=vio_mono_x, y=vio_mav, ax = axes[2])
axes[2].set_ylabel("Max average clade distance", fontsize=22, weight="bold")
axes[2].set_xticklabels(axes[2].get_xticklabels(),rotation=40)
axes[2].set_ylim(-0.0001,0.05)
axes[2].tick_params(labelsize=16)
fig, axes = plt.subplots(1) ; fig.set_size_inches(12, 6);
fig.subplots_adjust(top=.99, bottom=.01, left=.01, right=.98, wspace=.2, hspace=.2)
sns.set(); sns.set_palette("cubehelix", 12)
sns.barplot(hue=bplot_label, y=bplot_value, x=bplot_hue, ax=axes)
axes.set_ylabel("proportion", fontsize=18, fontweight="bold")
axes.tick_params(labelsize=14)
axes.set_title(title_list[table_col])
## extra figures
fig, axes = plt.subplots(3,1) ; fig.set_size_inches(16, 20);
matplotlib.rcParams['figure.dpi'] = 200
matplotlib.rc('font', weight='bold')
fig.subplots_adjust(top=.92, bottom=.01, left=.01, right=.99, wspace=.15, hspace=.15)
sns.set(); sns.set_palette("cubehelix", 12);
sns.boxenplot(x=vio_silho_x_u, y=vio_silho_y_u, ax = axes[0], outlier_prop=0.00001)
axes[0].set_ylabel("tree silhouette scores (unweighted)", fontsize=18, weight="bold")
axes[0].set_xticklabels(axes[0].get_xticklabels(),rotation=30)
axes[0].tick_params(labelsize=14)
axes[0].set_title(title_list[table_col])
sns.set(); sns.set_palette("cubehelix", 12)
sns.pointplot(x=vio_mono_x, y=vio_fspec, ax = axes[1], linestyles="--", errwidth=0.5, color="gray", scale=0.3)
sns.swarmplot(x=vio_mono_x, y=vio_fspec, ax = axes[1], size=10, linewidth=1)
axes[1].set_ylabel("inverse number of species below MRCA", fontsize=18, weight="bold")
axes[1].set_xticklabels(axes[1].get_xticklabels(),rotation=30)
axes[1].tick_params(labelsize=14)
sns.set(); sns.set_palette("cubehelix", 12)
sns.boxplot(x=vio_mono_x, y=vio_avv, ax = axes[2])
axes[2].set_ylabel("Mean average clade distance", fontsize=18, weight="bold")
axes[2].set_xticklabels(axes[2].get_xticklabels(),rotation=30)
axes[2].set_ylim(-0.0001,0.02)
axes[2].tick_params(labelsize=14)